rm(list=ls())

0.1 Importing library

## Importing library
### List of required packages
required_packages <- c("tidyverse","janitor" ,"readr","dplyr","haven","sf", "flextable","sp", "factoextra", "FactoMineR","gtsummary", "sjPlot", "fastDummies","ggthemes","spdep","patchwork")

# Check if packages are installed
missing_packages <- setdiff(required_packages, installed.packages()[,"Package"])

### Install missing packages
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

### Load all packages
lapply(required_packages, library, character.only = TRUE)
# Read shapefile data for 2002 and 2002


MPI_data_dr_2002 <- sf::read_sf(paste0(here::here(),"/output/output_data/MPI_data_dr_2002.shp"))
MPI_data_dr_2013 <- sf::read_sf(paste0(here::here(),"/output/output_data/MPI_data_dr_2013.shp"))

1 Regression modeling with Senegal Census data

#"nb_indv",
predictors = c("nbr_mng","nbr_dc_p","nbr_dc_m","nbr_dc_sc","nbr_dc_sp","mdn_cm_","mn_cm_g","nbr_cm_h","nbr_cm_f","pct_cm_")
outcome = "MPI_ndx"

1.1 Inspecting the outcome variable (MPI) with visualization

mhv_map <- ggplot(MPI_data_dr_2013, aes(fill = MPI_ndx)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  theme_void() + 
  labs(fill = "MPI ")

mhv_histogram <- ggplot(MPI_data_dr_2013, aes(x = MPI_ndx)) + 
  geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
                 bins = 100) + 
  theme_minimal() + 
  scale_x_continuous(labels = scales::label_number_si(accuracy = 0.1)) + 
  labs(x = "MPI")

mhv_map + mhv_histogram + labs(title = "MPI value charts for Senegal Census 2002")

mhv_map_log <- ggplot(MPI_data_dr_2013, aes(fill = log(MPI_ndx))) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  theme_void() + 
  labs(fill = "MPI\nvalue (log)")

mhv_histogram_log <- ggplot(MPI_data_dr_2013, aes(x = log(MPI_ndx))) + 
  geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
                 bins = 100) + 
  theme_minimal() + 
  scale_x_continuous() + 
  labs(x = "MPI (log)")

mhv_map_log + mhv_histogram_log + labs(title = "Logged MPI value charts for Senegal Census 2002")

1.2 A first regression model

library(sf)
library(units)
predictors = c("nbr_mng","nbr_dc_p","nbr_dc_m","nbr_dc_sc","nbr_dc_sp","mn_cm_g","pct_cm_","nb_indv")
outcome = "MPI_ndx"
MPI_data_dr_2013_for_model<- MPI_data_dr_2013 %>%
  dplyr::select(MPI_ndx,predictors) %>% 
  mutate(pop_density = as.numeric(set_units(nb_indv / st_area(.), "1/km2"))) %>% 
  dplyr::select(-nb_indv)
formula <- "log(MPI_ndx) ~ nbr_mng + nbr_dc_p + nbr_dc_m + nbr_dc_sc + nbr_dc_sp + mn_cm_g + pct_cm_ + pop_density "

model1 <- lm(formula = formula, data = MPI_data_dr_2013_for_model)

summary(model1)
## 
## Call:
## lm(formula = formula, data = MPI_data_dr_2013_for_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25739 -0.15703  0.01659  0.16114  1.04399 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.384e-01  2.339e-01  -3.584 0.000377 ***
## nbr_mng      7.733e-03  7.553e-04  10.238  < 2e-16 ***
## nbr_dc_p    -9.840e-03  1.992e-03  -4.939 1.13e-06 ***
## nbr_dc_m    -2.005e-02  2.772e-03  -7.232 2.22e-12 ***
## nbr_dc_sc   -1.396e-02  2.812e-03  -4.963 1.01e-06 ***
## nbr_dc_sp   -1.549e-02  1.587e-03  -9.757  < 2e-16 ***
## mn_cm_g     -3.719e-02  4.190e-03  -8.877  < 2e-16 ***
## pct_cm_      6.054e-01  1.565e-01   3.869 0.000126 ***
## pop_density  1.575e-06  8.175e-07   1.926 0.054765 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2692 on 426 degrees of freedom
## Multiple R-squared:  0.6061, Adjusted R-squared:  0.5987 
## F-statistic: 81.93 on 8 and 426 DF,  p-value: < 2.2e-16
library(corrr)

dfw_estimates <- MPI_data_dr_2013_for_model%>%
  select(-MPI_ndx) %>%
  st_drop_geometry()

correlations <- correlate(dfw_estimates, method = "pearson")
network_plot(correlations) + labs(title = "Network plot of correlations between model predictors for Senegal Census 2002")

library(car)

vif(model1)
##     nbr_mng    nbr_dc_p    nbr_dc_m   nbr_dc_sc   nbr_dc_sp     mn_cm_g 
##    3.555007    2.381862    2.434817    2.639560    2.186414    1.097728 
##     pct_cm_ pop_density 
##    1.079553    1.258178

1.3 Dimension reduction with principal components analysis

pca <- prcomp(
  formula = ~., 
  data = dfw_estimates, 
  scale. = TRUE, 
  center = TRUE
)

summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     1.7393 1.2937 1.0202 0.9148 0.78444 0.58939 0.5261
## Proportion of Variance 0.3781 0.2092 0.1301 0.1046 0.07692 0.04342 0.0346
## Cumulative Proportion  0.3781 0.5874 0.7175 0.8220 0.89897 0.94239 0.9770
##                            PC8
## Standard deviation     0.42903
## Proportion of Variance 0.02301
## Cumulative Proportion  1.00000
pca_tibble <- pca$rotation %>%
  as_tibble(rownames = "predictor")
pca_tibble
## # A tibble: 8 × 9
##   predictor      PC1     PC2      PC3     PC4      PC5     PC6      PC7      PC8
##   <chr>        <dbl>   <dbl>    <dbl>   <dbl>    <dbl>   <dbl>    <dbl>    <dbl>
## 1 nbr_mng     0.468   0.344   7.98e-4 -0.106   8.97e-4 -0.231   0.0192   0.773  
## 2 nbr_dc_p    0.252   0.586  -5.06e-2 -0.280  -3.62e-1 -0.290   0.0705  -0.540  
## 3 nbr_dc_m    0.499   0.0175 -3.74e-2 -0.0967 -4.11e-2  0.692  -0.500   -0.104  
## 4 nbr_dc_sc   0.488  -0.196  -6.42e-3  0.0450  2.07e-1  0.212   0.780   -0.158  
## 5 nbr_dc_sp   0.428  -0.289  -2.65e-2  0.213   4.24e-1 -0.552  -0.368   -0.258  
## 6 mn_cm_g     0.0145 -0.436   2.37e-1 -0.859  -2.17e-2 -0.119  -0.0223   0.0322 
## 7 pct_cm_     0.145  -0.120   8.37e-1  0.311  -4.07e-1 -0.0468 -0.0122  -0.00608
## 8 pop_densi… -0.162   0.459   4.89e-1 -0.140   6.92e-1  0.134  -0.00901 -0.0864
pca_tibble %>%
  select(predictor:PC5) %>%
  pivot_longer(PC1:PC5, names_to = "component", values_to = "value") %>%
  ggplot(aes(x = value, y = predictor)) + 
  geom_col(fill = "darkgreen", color = "darkgreen", alpha = 0.5) + 
  facet_wrap(~component, nrow = 1) + 
  labs(y = NULL, x = "Value", title = " Loadings for first five principal components  for Senegal Census 2002") + 
  theme_minimal()

components <- predict(pca, dfw_estimates)

dfw_pca <- MPI_data_dr_2013_for_model%>%
  select(MPI_ndx) %>%
  cbind(components) 

ggplot(dfw_pca, aes(fill = PC1)) +
  geom_sf(color = NA) +
  labs(title = "Map of principal component 1 for Senegal Census 2002") +
  theme_void() +
  scale_fill_viridis_c()

pca_formula <- paste0("log(MPI_ndx) ~ ", 
                      paste0('PC', 1:6, collapse = ' + '))

pca_model <- lm(formula = pca_formula, data = dfw_pca)

summary(pca_model)
## 
## Call:
## lm(formula = pca_formula, data = dfw_pca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41657 -0.17138  0.02118  0.18770  1.31306 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -2.191601   0.014555 -150.578  < 2e-16 ***
## PC1         -0.110044   0.008378  -13.135  < 2e-16 ***
## PC2          0.158967   0.011264   14.113  < 2e-16 ***
## PC3          0.044954   0.014283    3.147  0.00176 ** 
## PC4          0.086249   0.015929    5.415 1.03e-07 ***
## PC5         -0.058104   0.018576   -3.128  0.00188 ** 
## PC6         -0.033497   0.024723   -1.355  0.17617    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3036 on 428 degrees of freedom
## Multiple R-squared:  0.4968, Adjusted R-squared:  0.4898 
## F-statistic: 70.43 on 6 and 428 DF,  p-value: < 2.2e-16

1.4 Spatial regression

MPI_data_dr_2013_for_model$residuals <- residuals(model1)

ggplot(MPI_data_dr_2013_for_model, aes(x = residuals)) + 
  geom_histogram(bins = 100, alpha = 0.5, color = "navy",
                 fill = "navy") + 
  labs(title = "Distribution of model residuals for Senegal Census 2002") +
  theme_minimal()

library(spdep)

wts <- MPI_data_dr_2013_for_model %>%
  poly2nb() %>%
  nb2listw()

moran.test(MPI_data_dr_2013_for_model$residuals, wts)
## 
##  Moran I test under randomisation
## 
## data:  MPI_data_dr_2013_for_model$residuals  
## weights: wts    
## 
## Moran I statistic standard deviate = 5.0381, p-value = 2.351e-07
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.1417648099     -0.0023041475      0.0008177211
MPI_data_dr_2013_for_model$lagged_residuals <- lag.listw(wts, MPI_data_dr_2013_for_model$residuals)

ggplot(MPI_data_dr_2013_for_model, aes(x = residuals, y = lagged_residuals)) + 
  theme_minimal() + 
  labs(title = "Moran scatterplot of residual spatial autocorrelation for Senegal Census 2002") +
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "lm", color = "red")

2 Geographically weighted regression

2.1 Choosing a bandwidth for GWR

library(GWmodel)
library(sf)

dfw_data_sp <- MPI_data_dr_2013_for_model%>%
  as_Spatial()

bw <- bw.gwr(
  formula = formula, 
  data = dfw_data_sp, 
  kernel = "bisquare",
  adaptive = TRUE
)
## Adaptive bandwidth: 276 CV score: 30.72873 
## Adaptive bandwidth: 179 CV score: 30.5402 
## Adaptive bandwidth: 117 CV score: 31.56878 
## Adaptive bandwidth: 215 CV score: 30.53929 
## Adaptive bandwidth: 240 CV score: 30.59468 
## Adaptive bandwidth: 202 CV score: 30.50807 
## Adaptive bandwidth: 191 CV score: 30.50642 
## Adaptive bandwidth: 187 CV score: 30.51038 
## Adaptive bandwidth: 196 CV score: 30.50472 
## Adaptive bandwidth: 196 CV score: 30.50472
gw_model <- gwr.basic(
  formula = formula, 
  data = dfw_data_sp, 
  bw = bw,
  kernel = "bisquare",
  adaptive = TRUE
)
names(gw_model)
## [1] "GW.arguments"  "GW.diagnostic" "lm"            "SDF"          
## [5] "timings"       "this.call"     "Ftests"
gw_model_results <- gw_model$SDF %>%
  st_as_sf() 

names(gw_model_results)
##  [1] "Intercept"      "nbr_mng"        "nbr_dc_p"       "nbr_dc_m"      
##  [5] "nbr_dc_sc"      "nbr_dc_sp"      "mn_cm_g"        "pct_cm_"       
##  [9] "pop_density"    "y"              "yhat"           "residual"      
## [13] "CV_Score"       "Stud_residual"  "Intercept_SE"   "nbr_mng_SE"    
## [17] "nbr_dc_p_SE"    "nbr_dc_m_SE"    "nbr_dc_sc_SE"   "nbr_dc_sp_SE"  
## [21] "mn_cm_g_SE"     "pct_cm__SE"     "pop_density_SE" "Intercept_TV"  
## [25] "nbr_mng_TV"     "nbr_dc_p_TV"    "nbr_dc_m_TV"    "nbr_dc_sc_TV"  
## [29] "nbr_dc_sp_TV"   "mn_cm_g_TV"     "pct_cm__TV"     "pop_density_TV"
## [33] "Local_R2"       "geometry"
ggplot(gw_model_results, aes(fill = Local_R2)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  labs(title = "Local R-squared values from the GWR model for Senegal Census 2002") +
  theme_void()

ggplot(gw_model_results, aes(fill = nbr_mng)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  labs(title = "Local parameter estimates for household members for Senegal Census 2002") +
  theme_void() + 
  labs(fill = "Local β for \nHH")

ggplot(gw_model_results, aes(fill = pop_density)) + 
  geom_sf(color = NA) + 
  scale_fill_viridis_c() + 
  labs(title = "Local parameter estimates for population density for Senegal Census 2002") +
  theme_void() + 
  labs(fill = "Local β for \npopulation density")

3 Classification and clustering of Senegal census data

3.1 Geodemographic classification

set.seed(1983)

dfw_kmeans <- dfw_pca %>%
  st_drop_geometry() %>%
  select(PC1:PC8) %>%
  kmeans(centers = 6)

table(dfw_kmeans$cluster)
## 
##   1   2   3   4   5   6 
##  20  83 101  89  36 106
dfw_clusters <- dfw_pca %>%
  mutate(cluster = as.character(dfw_kmeans$cluster))

ggplot(dfw_clusters, aes(fill = cluster)) + 
  geom_sf(size = 0.1) + 
  scale_fill_brewer(palette = "Set1") + 
  labs(title = "Map of geodemographic clusters for Senegal Census 2002") +
  theme_void() + 
  labs(fill = "Cluster ")

library(plotly)

cluster_plot <- ggplot(dfw_clusters, 
                       aes(x = PC1, y = PC2, color = cluster)) + 
  geom_point() + 
  scale_color_brewer(palette = "Set1") + 
  labs(title = "Interactive scatterplot of PC1 and PC2 colored by cluster for Senegal Census 2002") +
  theme_minimal()

ggplotly(cluster_plot) %>%
  layout(legend = list(orientation = "h", y = -0.15, 
                       x = 0.2, title = "Cluster"))

3.2 Spatial clustering & regionalization

library(spdep)

input_vars <- dfw_pca %>%
  select(PC1:PC8) %>%
  st_drop_geometry() %>%
  as.data.frame() 

skater_nbrs <- poly2nb(dfw_pca, queen = TRUE)
costs <- nbcosts(skater_nbrs, input_vars)
skater_weights <- nb2listw(skater_nbrs, costs, style = "B")
mst <- mstree(skater_weights)

regions <- skater(
  mst[,1:2], 
  input_vars, 
  ncuts = 7,
  crit = 10
)
dfw_clusters$region <- as.character(regions$group)

ggplot(dfw_clusters, aes(fill = region)) + 
  geom_sf(size = 0.1) + 
  labs(title = "Map of contiguous regions derived with the SKATER algorithm for Senegal Census 2002") +
  scale_fill_brewer(palette = "Set1") + 
  theme_void()